Project Phase Description

In the second phase try to segment visitors into separate categories / segments. Try to answer following questions:

Loading libraries

library(tidyr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## 
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
library(reshape)
## 
## Attaching package: 'reshape'
## 
## Następujący obiekt został zakryty z 'package:lubridate':
## 
##     stamp
## 
## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     rename
## 
## Następujący obiekt został zakryty z 'package:tidyr':
## 
##     expand
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     select
library(cluster)
library(pvclust)
library(dendextend)
## 
## Welcome to dendextend version 1.1.2
## 
## Type ?dendextend to access the overall documentation and
## browseVignettes(package = 'dendextend') for the package vignette.
## You can execute a demo of the package via: demo(dendextend)
## 
## More information is available on the dendextend project web-site:
## https://github.com/talgalili/dendextend/
## 
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## 
##          To suppress the this message use:
##          suppressPackageStartupMessages(library(dendextend))
## 
## 
## Attaching package: 'dendextend'
## 
## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     %>%
## 
## Następujący obiekt został zakryty z 'package:tidyr':
## 
##     %>%
## 
## Następujący obiekt został zakryty z 'package:stats':
## 
##     cutree
library(ape)
## 
## Attaching package: 'ape'
## 
## Następujące obiekty zostały zakryte z 'package:dendextend':
## 
##     ladderize, rotate
library(RColorBrewer)
library(scales)
library(colorspace) # get nice colors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## Następujący obiekt został zakryty z 'package:dendextend':
## 
##     %>%
## 
## Następujący obiekt został zakryty z 'package:ggplot2':
## 
##     last_plot
## 
## Następujący obiekt został zakryty z 'package:graphics':
## 
##     layout
library(stringdist)

Loading data

load("C:\\Users\\Karola\\Documents\\BISD\\Semestr 2\\Data Mining\\Projekt 2\\verySmallLogs.rda")
#Transforming data in the same way as in phase 1

data <- verySmallLogs %>% 
  mutate(station,
         visitor,
         type,
         date,
         weekday = wday(date, label=TRUE, abbr=FALSE),
         hour = substr(date, 12, 13))

data$date <- as.POSIXct(data$date,format="%Y-%m-%d %H:%M:%S")
data = data[data$visitor != -1,]


dataEntering <- data[data$type=="Entering" & data$visitor != -1,]
dataLeaving <- data[data$type=="Leaving" & data$visitor != -1,]

newdataEntering = dataEntering %>%
  group_by(visitor, station) %>%
  summarise(min_date=min(date),
            weekday=head(weekday,1),
            hour=head(hour,1),
            count = n())

newdataLeaving = dataLeaving %>%
  group_by(visitor, station) %>%
  summarise(max_date=max(date),
            weekday=head(weekday,1),
            hour=head(hour,1),
            count = n())

mergedData <- merge(newdataEntering, newdataLeaving, by=c("visitor", "station"))
mergedData$time = as.numeric(mergedData$max_date-mergedData$min_date)
mergedData$hour = mergedData$hour.x
mergedData$weekday = mergedData$weekday.x
mergedData$count = mergedData$count.x
mergedData = mergedData[mergedData$time < 800,]

Data plot

ggplot(data=mergedData,aes(count, time, station))+geom_point(aes(colour = station))

ggplot(data=mergedData,aes(weekday, count, station))+geom_point(aes(colour = station))

Basic hierarchical clustering

Choose dinstance metric (scale and use manhattan)

mat1 <- dist(mergedData[1:100,c("count","time")])
as.matrix(mat1)[1:5,1:5]

mat2 <- dist(scale(mergedData[1:100,c("count","time")]))
as.matrix(mat2)[1:5,1:5]

mat3 <- dist(scale(mergedData[1:100,c("count","time")]), method="manhattan")
as.matrix(mat3)[1:5,1:5]

dat <- scale(mergedData[,c("count","time")])

Prepare data for hierarchical clustering

sampleData <- mergedData[sample(nrow(mergedData), 3000),]
head(sampleData)
sampleData <- transform(sampleData, 
                        visitor = as.numeric(visitor), 
                        station = as.numeric(station),
                        min_date = as.numeric(min_date),
                        max_date = as.numeric(max_date),
                        label = paste(station, visitor, sep="_"))

rownames(sampleData) <- sampleData$label
dat <- scale(sampleData[,c("count", "time", "station")])

Plot sampledData to see how it looks like

ggplot(data=sampleData,aes(count, time, weekday))+geom_point(aes(colour = weekday))

ggplot(data=sampleData,aes(count, time, station))+geom_point(aes(colour = station))

Cluster using Agnes package

hc <- agnes(dat, method="ward", metric = "manhattan")
dend <- as.dendrogram(hc)
dend <- color_branches(dend, k=3)
# plot(dend, horiz =  TRUE,  nodePar = list(cex = .007))
plot(cut(dend, h = 15)$upper, horiz =  FALSE, cex=0.01)

# plot(hc, which.plots=2, cex=0.1, main="", xlab="")
sampleData$labels = factor(cutree(dend, k=3))
ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=4)+theme_bw()

plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)

GRUPA 3 -

Cluster using hclust package

dat <- scale(sampleData[,c("count", "time", "station")])
hc <- hclust(dist(dat, method = "manhattan"), "ward.D")
sampleData$labels = factor(cutree(hc, k=3))

ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=4)+theme_bw()

plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)

Add station to features

sampleData <- mergedData[sample(nrow(mergedData), 3000),]
sampleData <- transform(sampleData, 
                        visitor = as.numeric(visitor), 
                        station = as.numeric(station),
                        min_date = as.numeric(min_date),
                        max_date = as.numeric(max_date),
                        label = paste(station, visitor, sep="_"))
dat <- scale(sampleData[,c("count","time", "visitor", "min_date", "max_date", "station")])
hc <- hclust(dist(dat, method = "manhattan"), "ward.D")
#plot(hc, labels = FALSE)
sampleData$labels = factor(cutree(hc, k=3))
ggplot(sampleData, aes(count, time, label=station, color=labels))+geom_text(size=3)+theme_bw()

plot_ly(sampleData, x = count, y = time, z = station, type = "scatter3d", mode = "markers", color = labels)

This is how we have clustered our data for now (using 3 groups)

Introduce new features

Try to get data by visitor (not by visitor, station) and introduce new features

  1. Time dependent features
    • total_time
    • max_time
    • min_time
    • weekday
    • hour
    • min_date
    • max_date
  2. station dependent features
    • most_freq_station
    • least_freq_station
    • min_count
    • max_count
    • total_count

Calculate new features

phase1Data <- mergedData[,c("visitor", "station", "max_date", "min_date", "time", "weekday", "hour", "count")]

phase1Data = phase1Data %>%
  group_by(visitor) %>%
  summarise(max_date=max(max_date),
            min_date=min(min_date),
            total_time=sum(time),
            min_time=min(time),
            max_time=max(time),
            weekday=head(weekday,1),
            hour=head(hour,1),
            total_count = sum(count),
            max_count = max(count),
            min_count = min(count),
            most_freq_station = head(station[which(count == max(count))],1),
            least_freq_station = head(station[which(count == min(count))],1))

sampleData <- phase1Data[sample(nrow(phase1Data), 2000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor

Scaling data and transforming to numeric

kmeansData <- transform(sampleData, 
                        visitor = as.numeric(visitor),
                        max_date = as.POSIXlt(max_date)$hour + as.POSIXlt(max_date)$min/60,
                        min_date = as.POSIXlt(min_date)$hour + as.POSIXlt(min_date)$min/60,
                        total_time = as.numeric(total_time),
                        min_time = as.numeric(min_time),
                        max_time = as.numeric(max_time),
                        #weekday = as.numeric(weekday),
                        hour = as.numeric(hour),
                        #most_freq_station = as.numeric(most_freq_station),
                        #least_freq_station = as.numeric(least_freq_station),
                       label = visitor)
 kmeansData$max_date <- scale(kmeansData$max_date)
 kmeansData$min_date <- scale(kmeansData$min_date)
 kmeansData$total_time <- scale(kmeansData$total_time)
 kmeansData$min_time <- scale(kmeansData$min_time)
 kmeansData$max_time <- scale(kmeansData$max_time)
 kmeansData$least_freq_station <- scale(kmeansData$total_count)

Trying to visualize data once more

Plot a SPLOM: (how features depend of each other)

SPLOM_DATA <- kmeansData[,c("total_time", "max_time", "min_time","min_date", "max_date", "min_count", "max_count", "total_count")]
station_col <- rev(rainbow_hcl(65))[as.numeric(sampleData$most_freq_station)]
pairs(SPLOM_DATA, col = station_col,
      lower.panel = NULL,
      cex.labels=1, pch=15, cex = 0.75)

Use K-means on all numeric features:

set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "max_time", "min_time","min_date", "max_date", "min_count", "max_count", "total_count")], 4)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)

ggplot(kmeansData, aes(total_time, total_count)) +
  geom_text(size=3, aes(label=most_freq_station, color=cluster)) + 
  geom_point(data=nd, size=3)+
  theme_bw()

plot_ly(kmeansData, x = max_time, y = total_count, z = total_time, type = "scatter3d", mode = "markers", color = cluster)

plot_ly(kmeansData, x = total_count, y = min_time, z = total_time, type = "scatter3d", mode = "markers", color = cluster)

plot_ly(kmeansData, x = total_count, y = min_date, z = total_time, type = "scatter3d", mode = "markers", color = cluster)

Use k-means on features: total_time, total_count

set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "total_count")], 4)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)
ggplot(kmeansData, aes(total_time, total_count)) +
  geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
  geom_point(data=nd, size=3)+
  theme_bw()

First group

kmFirstGroup = kmeansData[kmeansData$cluster == 1,] 

Long time of use and frequent interactions with machines

qplot(kmFirstGroup$most_freq_station)+geom_bar() 

Machines which are used most frequent: 10, 05, 56, 66 Least which are used least frequent: 18, 20

Second group

kmSecondGroup = kmeansData[kmeansData$cluster == 2,]

Rather medium and short times of interactions, the number of interactions is small

qplot(kmSecondGroup$most_freq_station)+geom_bar()

Similar to first group, however ‘cnk18’ is used more frequent, ‘cnk19’ is used less frequent. ‘cnk20’ is the most frequently used machine. Members, who use machine for short time, chooses ‘cnk20’ machine more frequently than people who use machine for long time.

Third group

kmThirdGroup = kmeansData[kmeansData$cluster == 3,] 

Result is similar to the result from second group.

qplot(kmThirdGroup$most_freq_station)+geom_bar()

Fourth group

kmFourthGroup = kmeansData[kmeansData$cluster == 4,] 

Members who use machine only for short time and they do have small number of iterations

qplot(kmFourthGroup$most_freq_station)+geom_bar() 

In most cases, members choose ‘19a’ machine, completely different than people who spend long times while using machines.

Using pam on all features:

  • max_time,
  • total_time
  • total_count
  • most_freq_station
  • min_date
  • weekday
  • least_freq_station
  • pamData <- transform(sampleData, 
                            max_date = as.POSIXlt(max_date)$hour + as.POSIXlt(max_date)$min/60,
                            min_date = as.POSIXlt(min_date)$hour + as.POSIXlt(min_date)$min/60,
                            total_time = as.numeric(total_time),
                            min_time = as.numeric(min_time),
                            max_time = as.numeric(max_time),                        label = visitor)
    
    model4 <- pam(pamData[,c("total_time", "total_count")], 4)
    pamData$cluster <- factor(model4$clustering)
    nd <- data.frame(model4$medoids)
    
    ggplot(pamData, aes(total_time, total_count)) +
      geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
      # geom_point(data=nd, size=3)+
      theme_bw()

    # plot_ly(pamData, x = total_time, y = total_count, z = most_freq_station, type = "scatter3d", mode = "markers", color = cluster)
    # plot_ly(pamData, x = total_time, y = hour, z = most_freq_station, type = "scatter3d", mode = "markers", color = cluster)
    # plot_ly(pamData, x = total_count, y = min_date, z = weekday, type = "scatter3d", mode = "markers", color = cluster)

Pca component analysis

pc <- prcomp(SPLOM_DATA)
comp <- data.frame(pc$x[,1:4])
plot(comp, pch=16, col=rgb(0,0,0,0.5))

k <- kmeans(comp, 4, nstart=25, iter.max=1000)
plot(comp, col=k$clust, pch=16)

summary(pc)
## Importance of components:
##                            PC1    PC2     PC3    PC4     PC5     PC6
## Standard deviation     24.9358 7.5305 4.26549 1.3992 1.05410 0.54217
## Proportion of Variance  0.8881 0.0810 0.02599 0.0028 0.00159 0.00042
## Cumulative Proportion   0.8881 0.9691 0.99513 0.9979 0.99951 0.99993
##                            PC7     PC8
## Standard deviation     0.16366 0.14785
## Proportion of Variance 0.00004 0.00003
## Cumulative Proportion  0.99997 1.00000
biplot(pc)

Finding a pattern of stations visited by user

We introduce new features and distance metric:
patternData <- mergedData[,c("visitor", "station", "time", "count", "min_date")]
head(patternData)
##   visitor station time count            min_date
## 1  795453   cnk61   18     5 2012-01-03 09:53:00
## 2  795455   cnk10   88    12 2012-01-03 10:00:20
## 3  795455   cnk66   92    17 2012-01-03 09:58:29
## 4  795467   cnk10   34     4 2012-01-03 09:45:06
## 5  795476   cnk18   31     8 2012-01-03 09:41:09
## 6  795476   cnk38   82    21 2012-01-03 10:21:33
patternData = patternData %>%
  arrange(min_date) %>%
  group_by(visitor) %>%
  summarise(total_time=sum(time),
            total_count = sum(count),
            first_station = head(station,1),
            last_station = tail(station,1),
            station_path = paste(station, collapse="_"),
            most_freq_station = head(station[which(count == max(count))],1),
            least_freq_station = head(station[which(count == min(count))],1))
           
sampleData <- patternData[sample(nrow(patternData), 2000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor

Clustering on station_path using hierarchicla clustering with restricted Damerau-Levenshtein distance

Cluster into 3 groups

d <- stringdistmatrix(sampleData$station_path, sampleData$station_path)
cl <- hclust(as.dist(d))
#plot(cl)
sampleData$labels = factor(cutree(cl, k=3))
ggplot(sampleData, aes(total_count, total_time, label=most_freq_station, color=labels))+geom_text(size=3)+theme_bw()

ggplot(sampleData, aes(total_count, total_time, color=labels))+geom_point(size=2)+theme_bw()

First group station paths:

firstGroup = (sampleData %>% filter(labels == 1))
qplot(firstGroup$most_freq_station)+geom_bar() 

Use of machine is equally distributed

qplot(firstGroup$total_time)+geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Domination of short times (below 500)

head(firstGroup[,c("station_path")])
## Source: local data frame [6 x 1]
## 
##              station_path
##                     (chr)
## 1             cnk61_cnk56
## 2 cnk66_cnk20_cnk05_cnk56
## 3                   cnk18
## 4                   cnk18
## 5 cnk05_cnk10_cnk66_cnk18
## 6 cnk10_cnk20_cnk18_cnk61

Mainly short processes, mostly 1 or 2 machines used

Second group station paths:

secondGroup = (sampleData %>% filter(labels == 2))
qplot(secondGroup$most_freq_station)+geom_bar() 

Some machines, for example cnk19a are not used by users from the same group. On the other hand ‘cnk66’, ‘cnk05’ are used very frequently.

qplot(secondGroup$total_time)+geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Total times are medium ones and are distrbitued with normal distribution (around 500)

head(secondGroup[,c("station_path")]) 
## Source: local data frame [6 x 1]
## 
##                                 station_path
##                                        (chr)
## 1  cnk10_cnk05_cnk66_cnk20_cnk38_cnk56_cnk61
## 2  cnk66_cnk10_cnk20_cnk05_cnk18_cnk61_cnk56
## 3 cnk10_cnk05_cnk19a_cnk66_cnk20_cnk18_cnk61
## 4       cnk19a_cnk05_cnk66_cnk20_cnk38_cnk56
## 5       cnk19a_cnk10_cnk66_cnk20_cnk05_cnk18
## 6       cnk19a_cnk20_cnk66_cnk10_cnk18_cnk61

Users of this group are machines in random way. They start mostly on machines: ‘20’, ‘05’, ‘10’ and ends on ‘56’ or ‘38’. Also they use machines ‘66’, ‘61’, ‘18’, ‘20’ in random order.

Third group station paths:

thirdGroup = (sampleData %>% filter(labels == 3))
qplot(thirdGroup$most_freq_station)+geom_bar() 

Users of this group uses machine more randomly, ‘cnk18’ is almost not used by group members. On the other hand cnk38 is used very often. Those people uses also cnk66 and cnk05.

qplot(thirdGroup$total_time)+geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Total usage times are large (above 500)

head(thirdGroup[,c("station_path")]) 
## Source: local data frame [6 x 1]
## 
##                          station_path
##                                 (chr)
## 1       cnk20_cnk66_cnk61_cnk38_cnk18
## 2 cnk05_cnk66_cnk10_cnk20_cnk61_cnk56
## 3 cnk05_cnk10_cnk66_cnk18_cnk20_cnk56
## 4 cnk05_cnk10_cnk20_cnk66_cnk18_cnk38
## 5 cnk05_cnk66_cnk20_cnk18_cnk56_cnk38
## 6       cnk10_cnk05_cnk66_cnk18_cnk61

Group members starts mostly on ‘cnk10’ or ‘cnk05’ machine and then they play on ‘cnk66’, ‘cnk20’, ‘cnk18’ and ‘cnk61’ machines. At the end they finish on ‘cnk56 and ’cnk38’.

Cluster into 100 groups

sampleData$labels = factor(cutree(cl, k=100))

Information for 3 groups

firstGroup = (sampleData %>% filter(labels == 1))
length(unique(firstGroup$station_path))
## [1] 62
length(firstGroup$station_path)
## [1] 511
unique(firstGroup$station_path)
##  [1] "cnk61_cnk56"  "cnk38_cnk18"  "cnk10_cnk05"  "cnk66_cnk20" 
##  [5] "cnk20_cnk61"  "cnk20_cnk66"  "cnk56_cnk05"  "cnk18_cnk61" 
##  [9] "cnk05_cnk10"  "cnk56_cnk61"  "cnk66_cnk18"  "cnk20_cnk56" 
## [13] "cnk10_cnk66"  "cnk05_cnk18"  "cnk05_cnk56"  "cnk10_cnk61" 
## [17] "cnk05_cnk66"  "cnk10_cnk56"  "cnk20_cnk18"  "cnk18_cnk56" 
## [21] "cnk66_cnk61"  "cnk05_cnk20"  "cnk10_cnk20"  "cnk18_cnk38" 
## [25] "cnk18_cnk20"  "cnk66_cnk56"  "cnk38_cnk56"  "cnk10_cnk18" 
## [29] "cnk05_cnk61"  "cnk20_cnk10"  "cnk66_cnk38"  "cnk20_cnk38" 
## [33] "cnk61_cnk18"  "cnk56_cnk10"  "cnk18_cnk10"  "cnk18_cnk66" 
## [37] "cnk19a_cnk20" "cnk05_cnk19a" "cnk05_cnk38"  "cnk19a_cnk18"
## [41] "cnk19a_cnk38" "cnk19a_cnk10" "cnk10_cnk19a" "cnk19a_cnk66"
## [45] "cnk19a_cnk61" "cnk61_cnk66"  "cnk18_cnk19a" "cnk56_cnk19a"
## [49] "cnk66_cnk19a" "cnk19a_cnk56" "cnk66_cnk10"  "cnk61_cnk19a"
## [53] "cnk61_cnk38"  "cnk19a_cnk05" "cnk10_cnk38"  "cnk18_cnk05" 
## [57] "cnk20_cnk19a" "cnk38_cnk20"  "cnk38_cnk61"  "cnk38_cnk19a"
## [61] "cnk61_cnk20"  "cnk38_cnk10"
secondGroup = (sampleData %>% filter(labels == 3))
length(unique(secondGroup$station_path))
## [1] 9
length(secondGroup$station_path)
## [1] 811
unique(secondGroup$station_path)
## [1] "cnk18"  "cnk05"  "cnk66"  "cnk20"  "cnk56"  "cnk61"  "cnk10"  "cnk38" 
## [9] "cnk19a"
thirdGroup = (sampleData %>% filter(labels == 7))
length(unique(thirdGroup$station_path))
## [1] 24
length(thirdGroup$station_path)
## [1] 60
unique(thirdGroup$station_path)
##  [1] "cnk05_cnk10_cnk66"  "cnk05_cnk20_cnk56"  "cnk05_cnk10_cnk18" 
##  [4] "cnk05_cnk66_cnk38"  "cnk05_cnk18_cnk61"  "cnk05_cnk66_cnk18" 
##  [7] "cnk05_cnk20_cnk18"  "cnk05_cnk10_cnk56"  "cnk05_cnk66_cnk20" 
## [10] "cnk05_cnk10_cnk61"  "cnk05_cnk19a_cnk66" "cnk05_cnk19a_cnk18"
## [13] "cnk05_cnk19a_cnk56" "cnk05_cnk61_cnk56"  "cnk05_cnk19a_cnk38"
## [16] "cnk05_cnk19a_cnk20" "cnk05_cnk19a_cnk61" "cnk05_cnk38_cnk56" 
## [19] "cnk05_cnk20_cnk10"  "cnk05_cnk66_cnk10"  "cnk05_cnk20_cnk66" 
## [22] "cnk05_cnk18_cnk66"  "cnk05_cnk20_cnk38"  "cnk05_cnk61_cnk18"

Summary

We distinguished following similarity measures:
  1. Time dependent features
    • total_time
    • max_time
    • min_time
    • weekday
    • hour
    • min_date
    • max_date
  2. station dependent features
    • most_freq_station
    • least_freq_station
    • min_count
    • max_count
    • total_count

The population is heterogeneous.

Research carried out on the supplied data set showed that it is possible to distinguish four groups that show signs of similarities.

As an example, one of the group has following features. Team members uses machines for short time and total numer of used machines is relatively low. The other group members plays much longer then previous one. Moreover, one can determine a specific path of their usage (starting on specific machine, ending in some machine).

Histogram from the document shows that machines, which were used by people, who plays only few times are not used by team members which uses machine very frequently. To be more specific - number of usage of machine ‘19a’ is small, furthermore it is used by such players, not by addicted people.

Second conclusion is that there exists group, which uses machine only few times and then they give up (resign) due to fact, that they lose some amount of money or they just wanted to try.

Another fact is that another group can be indetified. Its members develop some schemas and strategies (using specific machines, specific ‘paths through machines’). Such members can be considered as ‘Addicts’. Machines which are used most frequent: 10, 05, 56, 66 Least which are used least frequent: 18, 20

From researcher point of view, it is possible to point out, that there exists group which uses machines occasionally - ‘Occasional players’. Frequency of playing is smaller than frequency of the aforementioned group. Usage of machines is similar to ‘Addicts’ group, however ‘cnk18’ is used more frequently, ‘cnk19’ is used less frequently. ‘cnk20’ is the most frequently used machine. Members, who use machine for short time, chooses ‘cnk20’ machine more frequently than people who use machine for long time.

Machine patterns: If we increase the number of classes, we will obtain 100 groups of people with very similar preferences, eg. single person whose behaviour is similar every time. In this way people are clustered together when: they have the same length of their path, visit similar machines and start from the same station.